Setup

resultsPath=file.path(getwd(),"Results")
# Gather parameters from command line  
#dir.create(file.path(resultsPath,"cache"), showWarnings=F, recursive=T)
nCores <- parallel::detectCores()#params$nCores
subsetGenes <- params$subsetGenes 
subsetCells <- params$subsetCells 
resolution <-  as.numeric(params$resolution)
interactive <- params$interactive 

root <- getwd()
# Have to setwd via knitr
# knitr::opts_knit$set(root.dir=resultsPath, child.path = resultsPath)
knitr::opts_chunk$set(echo=T, error=T, root.dir = resultsPath 
                      # cache=T, cache.lazy=T
                      ) 

# Utilize parallel processing later on
cat(paste("**** Utilized Cores **** =", nCores))   
## **** Utilized Cores **** = 4
params
## $subsetGenes
## [1] "protein_coding"
## 
## $subsetCells
## [1] 500
## 
## $resolution
## [1] 0.6
## 
## $resultsPath
## [1] "./"
## 
## $interactive
## [1] FALSE

** ./ **

Load Libraries & Report Versions

library(Seurat)
library(dplyr)
library(ulimit) # devtools::install_github("krlmlr/ulimit")
library(benchmarkme)
library(gridExtra)
library(knitr) 
library(plotly)
library(ggplot2)
library(viridis)
library(reshape2)
library(shiny) 
library(ggrepel)
library(DT) 
library(ComplexHeatmap); #BiocManager::install("ComplexHeatmap") 
 
# install.packages('devtools')
# devtools::install_github('talgalili/heatmaply')
## Install Bioconductor
#  if (!requireNamespace("BiocManager"))
#     install.packages("BiocManager")
# BiocManager::install(c("biomaRt"))
library(biomaRt)
# BiocManager::install(c("DESeq2"))
library(DESeq2)
# library(snow); #BiocManager::install("Rmpi") #NOTE: different lib name than install name (snow vs Rmpi)
 
createDT <- function(DF, caption="", scrollY=500){
  data <- DT::datatable(DF, caption=caption,
    extensions = list('Buttons','Scroller'),
    options = list( dom = 'Bfrtip', 
                    buttons = c('copy', 'csv', 'excel', 'pdf', 'print'), 
                    scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = F,  
                      columnDefs = list(list(className = 'dt-center', targets = "_all"))
    )
  ) 
   return(data)
}
# Useful Seurat functions
## Seurat::FindGeneTerms() # Enrichr API
## Seurat::MultiModal_CCA() # Integrates data from disparate datasets (CIA version too)

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] parallel  stats4    grid      stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] DESeq2_1.22.2               SummarizedExperiment_1.12.0
##  [3] DelayedArray_0.8.0          BiocParallel_1.16.5        
##  [5] matrixStats_0.54.0          Biobase_2.42.0             
##  [7] GenomicRanges_1.34.0        GenomeInfoDb_1.18.1        
##  [9] IRanges_2.16.0              S4Vectors_0.20.1           
## [11] BiocGenerics_0.28.0         biomaRt_2.38.0             
## [13] ComplexHeatmap_1.20.0       DT_0.5.1                   
## [15] ggrepel_0.8.0               shiny_1.2.0                
## [17] reshape2_1.4.3              viridis_0.5.1              
## [19] viridisLite_0.3.0           plotly_4.8.0               
## [21] knitr_1.21                  gridExtra_2.3              
## [23] benchmarkme_0.6.0           ulimit_0.0-3               
## [25] dplyr_0.7.8                 Seurat_2.3.4               
## [27] Matrix_1.2-15               cowplot_0.9.4              
## [29] ggplot2_3.1.0              
## 
## loaded via a namespace (and not attached):
##   [1] snow_0.4-3             backports_1.1.3        circlize_0.4.5        
##   [4] Hmisc_4.1-1            plyr_1.8.4             igraph_1.2.2          
##   [7] lazyeval_0.2.1         splines_3.5.1          digest_0.6.18         
##  [10] foreach_1.4.4          htmltools_0.3.6        lars_1.2              
##  [13] gdata_2.18.0           memoise_1.1.0          magrittr_1.5          
##  [16] checkmate_1.9.0        cluster_2.0.7-1        doParallel_1.0.14     
##  [19] mixtools_1.1.0         ROCR_1.0-7             annotate_1.60.0       
##  [22] R.utils_2.7.0          prettyunits_1.0.2      colorspace_1.3-2      
##  [25] blob_1.1.1             xfun_0.4               crayon_1.3.4          
##  [28] RCurl_1.95-4.11        jsonlite_1.6           genefilter_1.64.0     
##  [31] bindr_0.1.1            survival_2.43-3        zoo_1.8-4             
##  [34] iterators_1.0.10       ape_5.2                glue_1.3.0            
##  [37] gtable_0.2.0           zlibbioc_1.28.0        XVector_0.22.0        
##  [40] GetoptLong_0.1.7       kernlab_0.9-27         shape_1.4.4           
##  [43] prabclus_2.2-6         DEoptimR_1.0-8         scales_1.0.0          
##  [46] mvtnorm_1.0-8          DBI_1.0.0              bibtex_0.4.2          
##  [49] Rcpp_1.0.0             metap_1.0              dtw_1.20-1            
##  [52] progress_1.2.0         xtable_1.8-3           htmlTable_1.13.1      
##  [55] reticulate_1.10        foreign_0.8-71         bit_1.1-14            
##  [58] proxy_0.4-22           mclust_5.4.2           SDMTools_1.1-221      
##  [61] Formula_1.2-3          tsne_0.1-3             htmlwidgets_1.3       
##  [64] httr_1.4.0             gplots_3.0.1           RColorBrewer_1.1-2    
##  [67] fpc_2.1-11.1           acepack_1.4.1          modeltools_0.2-22     
##  [70] ica_1.0-2              pkgconfig_2.0.2        XML_3.98-1.16         
##  [73] R.methodsS3_1.7.1      flexmix_2.3-14         nnet_7.3-12           
##  [76] locfit_1.5-9.1         tidyselect_0.2.5       rlang_0.3.1           
##  [79] later_0.7.5            AnnotationDbi_1.44.0   munsell_0.5.0         
##  [82] tools_3.5.1            RSQLite_2.1.1          ggridges_0.5.1        
##  [85] evaluate_0.12          stringr_1.3.1          yaml_2.2.0            
##  [88] npsurv_0.4-0           bit64_0.9-7            fitdistrplus_1.0-11   
##  [91] robustbase_0.93-3      caTools_1.17.1.1       purrr_0.2.5           
##  [94] RANN_2.6.1             bindrcpp_0.2.2         pbapply_1.3-4         
##  [97] nlme_3.1-137           mime_0.6               R.oo_1.22.0           
## [100] hdf5r_1.0.1            compiler_3.5.1         rstudioapi_0.9.0      
## [103] png_0.1-7              lsei_1.2-0             geneplotter_1.60.0    
## [106] tibble_2.0.0           stringi_1.2.4          lattice_0.20-38       
## [109] trimcluster_0.1-2.1    pillar_1.3.1           Rdpack_0.10-1         
## [112] lmtest_0.9-36          GlobalOptions_0.1.0    data.table_1.11.8     
## [115] bitops_1.0-6           irlba_2.3.2            gbRd_0.4-11           
## [118] httpuv_1.4.5.1         R6_2.3.0               latticeExtra_0.6-28   
## [121] promises_1.0.1         KernSmooth_2.23-15     codetools_0.2-16      
## [124] benchmarkmeData_0.5.1  MASS_7.3-51.1          gtools_3.8.1          
## [127] assertthat_0.2.0       rjson_0.2.20           withr_2.1.2           
## [130] GenomeInfoDbData_1.2.0 hms_0.4.2              diptest_0.75-7        
## [133] doSNOW_1.0.16          rpart_4.1-13           tidyr_0.8.2           
## [136] class_7.3-15           rmarkdown_1.11         segmented_0.5-3.0     
## [139] Rtsne_0.15             base64enc_0.1-3
print(paste("Seurat ", packageVersion("Seurat")))
## [1] "Seurat  2.3.4"

Raise Memory Limit

Rstudio has a default memory limit of only 1GB. To override this, detect the true memory available and set a new limit.

RAM <- print(benchmarkme::get_ram())
## 8.59 GB
## Convert GB to Mib
RAM_Mib <- strsplit(RAM, " ")[[1]][1] %>% as.numeric() * 953.67431640625
cat(paste("Available RAM:",RAM))
## Available RAM: 8.59 GB
## Set new memory limit 
ulimit::memory_limit(RAM_Mib) 
## soft hard 
## 8192  Inf

Load Data

## ! IMPORTANT! Must not setwd to local path when launching on cluster
# setwd("~/Desktop/PD_scRNAseq/")
dir.create(file.path(root,"Data"), showWarnings=F) 
load(file.path(root,"Data/seurat_object_add_HTO_ids.Rdata"))
pbmc <- seurat.obj  
rm(seurat.obj)

Pre-filtered Dimensions

pbmc
## An object of class seurat in project RAJ_13357 
##  24914 genes across 22113 samples.

Clean Metadata

Add Metadata

metadata <- read.table(file.path(root,"Data/meta.data4.tsv"))
createDT( metadata, caption = "Metadata")  
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Make AgeGroups
makeAgeGroups <- function(){
  dim(metadata)
  getMaxRound <- function(vals=metadata$Age, unit=10)unit*ceiling((max(vals)/unit))
  getMinRound <- function(vals=metadata$Age, unit=10)unit*floor((min(vals)/unit)) 
   
  ageBreaks = c(seq(getMinRound(), getMaxRound(), by = 10), getMaxRound()+10)
  AgeGroupsUniq <- c()
  for (i in 1:(length(ageBreaks)-1)){ 
    AgeGroupsUniq <- append(AgeGroupsUniq, paste(ageBreaks[i],ageBreaks[i+1], sep="-")) 
  } 
  data.table::setDT(metadata,keep.rownames = T,check.names = F)[, AgeGroups := cut(Age, 
                                  breaks = ageBreaks, 
                                  right = F, 
                                  labels = AgeGroupsUniq,
                                  nclude.lowest=T)]
  metadata <- data.frame(metadata)
  unique(metadata$AgeGroups)
  head(metadata)
  dim(metadata)
  return(metadata)
}
# metadata <- makeAgeGroups()

pbmc <- AddMetaData(object = pbmc, metadata = metadata)  
# Get rid of any NAs (cells that don't match up with the metadata) 
if(subsetCells==F){
  pbmc <- FilterCells(object = pbmc,  subset.names = "nGene", low.thresholds = 0)
} else {pbmc <- FilterCells(object = pbmc,  subset.names = "nGene", low.thresholds = 0,
                    # Subset for testing 
                    cells.use = pbmc@cell.names[0:subsetCells]
                    )
}  

Filter & Normalize Data

Subset Genes by Biotype

Include only subsets of genes by type. Biotypes from: https://useast.ensembl.org/info/genome/genebuild/biotypes.html

subsetBiotypes <- function(pbmc, subsetGenes){
  if( subsetGenes!=F ){
    cat(paste("Subsetting genes:",subsetGenes))
    # If the gene_biotypes file exists, import csv. Otherwise, get from biomaRt
    if(file_test("-f", file.path(root,"Data/gene_biotypes.csv"))){
      biotypes <- read.csv(file.path(root,"Data/gene_biotypes.csv"))
    }
    else {
      ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org",
                       dataset="hsapiens_gene_ensembl") 
      ensembl <- useDataset(mart = ensembl, dataset = "hsapiens_gene_ensembl")
      listFilters(ensembl)
      listAttributes(ensembl)   
      biotypes <- getBM(attributes=c("hgnc_symbol", "gene_biotype"), filters="hgnc_symbol",
            values=row.names(pbmc@data), mart=ensembl) 
      write.csv(biotypes, file.path(root,"Data/gene_biotypes.csv"), quote=F, row.names=F)
    } 
    # Subset data by creating new Seurat object (annoying but necessary)
    geneSubset <- biotypes[biotypes$gene_biotype==subsetGenes,"hgnc_symbol"] 
    
    cat(paste(dim(pbmc@raw.data[geneSubset, ])[1],"/", dim(pbmc@raw.data)[1], 
                "genes are", subsetGenes))
    # Add back into pbmc 
    subset.matrix <- pbmc@raw.data[geneSubset, ] # Pull the raw expression matrix from the original Seurat object containing only the genes of interest
    pbmc_sub <- CreateSeuratObject(subset.matrix) # Create a new Seurat object with just the genes of interest
    orig.ident <- row.names(pbmc@meta.data) # Pull the identities from the original Seurat object as a data.frame
    pbmc_sub <- AddMetaData(object = pbmc_sub, metadata = pbmc@meta.data) # Add the idents to the meta.data slot
    pbmc_sub <- SetAllIdent(object = pbmc_sub, id = "ident") # Assign identities for the new Seurat object
    pbmc <- pbmc_sub
    rm(list = c("pbmc_sub","geneSubset", "subset.matrix", "orig.ident")) 
  } 
}

subsetBiotypes(pbmc, subsetGenes)
## Subsetting genes: protein_coding14827 / 24914 genes are protein_coding

Subset Cells

Filter by cells, normalize , filter by gene variability.

pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"), 
    low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))

pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", 
    scale.factor = 10000)

Subset Genes by Variance

** Important!**: * In ScaleData… + Specify do.par = F (unless you have parallel processing set up properly, this will cause your script to crash) + Specify num.cores = nCores (to use all available cores, determined by parallel::detectCores())

Regress out: number of unique transcripts (nUMI), % mitochondrial transcripts (percent.mito)

# Store the top most variable genes in @var.genes
pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
    x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

# IMPORTANT!: Must set do.par=T and num.cors = n for large datasets being processed on computing clusters
# IMPORTANT!: Use only the var.genes identified by 'FindVariableGenes' as the 'gene.use' arg in 'ScaleData'
## This will greatly reduced the computational load.

# par.Cores <- ifelse(nCores <= 12, 48, nCores)
pbmc <- ScaleData(object = pbmc, genes.use = pbmc@var.genes, vars.to.regress = c("nUMI", "percent.mito"), 
                  do.par = F, num.cores = nCores)
## Regressing out: nUMI, percent.mito
## Warning in RegressOutResid(object = object, vars.to.regress =
## vars.to.regress, : For parallel processing, please set do.par to TRUE.
## 
## Time Elapsed:  5.39693117141724 secs
## Scaling data matrix

Filtered Dimensions

pbmc
## An object of class seurat in project RAJ_13357 
##  24914 genes across 495 samples.

Diagnostic Plots

Violin Plots

vp <- VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito"),nCol = 3, do.return = T) %>% + ggplot2::aes(alpha=0.5)
vp

Gene Plots

percent.mito plot

# par(mfrow = c(1, 2))
do.hover <- ifelse(interactive==T, T, F)
gp1 <- GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "percent.mito", pch.use=20, 
         do.hover=do.hover, data.hover = "mut")
gp1
## NULL

nGene plot

do.hover <-ifelse(interactive==T, T, F)
gp2 <- GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "nGene", pch.use=20, 
         do.hover=do.hover, data.hover = "mut")
gp2
## NULL

Dimensionality Reduction

PCA

ProjectPCA scores each gene in the dataset (including genes not included in the PCA) based on their correlation with the calculated components. Though we don’t use this further here, it can be used to identify markers that are strongly correlated with cellular heterogeneity, but may not have passed through variable gene selection. The results of the projected PCA can be explored by setting use.full=T in the functions above

  • Other Dim Reduction Methods in Seurat
  • RunCCA()
  • RunMultiCCA()
  • RunDiffusion()
  • RunPHATE()
  • RunICA()
# Run PCA with only the top most variables genes
pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print=F, verbose=T) #, pcs.print = 1:5,  genes.print = 5
## Working dimension size 27
## Initializing starting vector v with samples from standard normal distribution.
## Use `set.seed` first for reproducibility.
## irlba: using fast C implementation
# Store in Seurat object so you don't have to recalculate it for the tSNE/UMAP steps
pbmc <- ProjectPCA(object = pbmc, do.print=F) 

VizPCA

VizPCA(object = pbmc, pcs.use = 1:2)

PCA plot

do.hover <-ifelse(interactive==T, T, F)
PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2, do.hover=do.hover, data.hover="mut")

PCHeatmaps

# 'PCHeatmap' is a wrapper for heatmap.2  
PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = 500, do.balanced=T, label.columns=F, use.full=F)  

Significant PCs

Determine statistically significant PCs for further analysis. NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in PCElbowPlot() can be used to reduce computation time

#pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
PCElbowPlot(object = pbmc)

Find Cell Clusters

We first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To cluster the cells, we apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function.

On Resolution
The FindClusters function implements the procedure, and contains a resolution parameter that sets the ‘granularity’ of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 0.6-1.2 typically returns good results for single cell datasets of around 3K cells. Optimal resolution often increases for larger datasets. The clusters are saved in the object@ident slot.

  • To further increase speed, you can employ an approximate nearest neighbor search via the RANN package by increasing the nn.eps + parameter. Setting this at 0 (the default) represents an exact neighbor search.
  • By default, we perform 100 random starts for clustering and select the result with highest modularity. You can lower this through the n.start parameter to reduce clustering time.
# TRY DIFFERENT RESOLUTIONS
pbmc <- StashIdent(object = pbmc, save.name = "pre_clustering") 
# pbmc <- SetAllIdent(object = pbmc, id = "pre_clustering") 

pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, 
                     resolution = resolution, print.output = T, save.SNN = T, 
                     n.start = 10, nn.eps = 0.5) 
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 495
## Number of edges: 37091
## 
## Running Louvain algorithm...
## Random start: 1
## Iteration: 1
## Modularity: 0.5038
## Iteration: 2
## Modularity: 0.5038
## 
## Random start: 2
## Iteration: 1
## Modularity: 0.5038
## Iteration: 2
## Modularity: 0.5038
## 
## Random start: 3
## Iteration: 1
## Modularity: 0.5038
## Iteration: 2
## Modularity: 0.5038
## 
## Random start: 4
## Iteration: 1
## Modularity: 0.5038
## Iteration: 2
## Modularity: 0.5038
## 
## Random start: 5
## Iteration: 1
## Modularity: 0.5038
## Iteration: 2
## Modularity: 0.5038
## 
## Random start: 6
## Iteration: 1
## Modularity: 0.5038
## Iteration: 2
## Modularity: 0.5038
## 
## Random start: 7
## Iteration: 1
## Modularity: 0.5020
## Iteration: 2
## Modularity: 0.5039
## Iteration: 3
## Modularity: 0.5039
## 
## Random start: 8
## Iteration: 1
## Modularity: 0.5038
## Iteration: 2
## Modularity: 0.5038
## 
## Random start: 9
## Iteration: 1
## Modularity: 0.5039
## Iteration: 2
## Modularity: 0.5039
## 
## Random start: 10
## Iteration: 1
## Modularity: 0.5038
## Iteration: 2
## Modularity: 0.5038
## 
## Maximum modularity in 10 random starts: 0.5039
## Number of communities: 3
## Elapsed time: 0 seconds
PrintFindClustersParams(object = pbmc) 
## Parameters used in latest FindClusters calculation run on: 2019-01-14 18:19:39
## =============================================================================
## Resolution: 0.6
## -----------------------------------------------------------------------------
## Modularity Function    Algorithm         n.start         n.iter
##      1                   1                 10              10
## -----------------------------------------------------------------------------
## Reduction used          k.param          prune.SNN
##      pca                 30                0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10
pbmc <- StashIdent(object = pbmc, save.name = "post_clustering") 

UMAP

Additional UMAP arguments detailed here: https://umap-learn.readthedocs.io/en/latest/api.html#module-umap.umap_

pbmc <- RunUMAP(object = pbmc, reduction.use = "pca", dims.use = 1:10, verbose=TRUE) # , num_threads=0 
# Plot results
DimPlot(object = pbmc, reduction.use = 'umap')

t-SNE

As input to the tSNE, we suggest using the same PCs as input to the clustering analysis, although computing the tSNE based on scaled gene expression is also supported using the genes.use argument.

** Important!**: Specify num_threads=0 in ‘RunTSNE’ to use all available cores.

“FItSNE”, a new fast implementation of t-SNE, is also available through RunTSNE. However FItSNE must first be setup on your computer.

labSize <- 6 

pbmc <- RunTSNE(object=pbmc,  reduction.use = "pca", dims.use = 1:10, do.fast = TRUE,
                tsne.method = "Rtsne", num_threads=0, verbose=T) #   FItSNE
## Read the 495 x 10 data matrix successfully!
## OpenMP is working. 4 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.08 seconds (sparsity = 0.271776)!
## Learning embedding...
## Iteration 50: error is 60.051605 (50 iterations in 0.15 seconds)
## Iteration 100: error is 60.036368 (50 iterations in 0.17 seconds)
## Iteration 150: error is 60.035447 (50 iterations in 0.18 seconds)
## Iteration 200: error is 60.038151 (50 iterations in 0.20 seconds)
## Iteration 250: error is 60.034362 (50 iterations in 0.19 seconds)
## Iteration 300: error is 1.457804 (50 iterations in 0.13 seconds)
## Iteration 350: error is 1.404030 (50 iterations in 0.12 seconds)
## Iteration 400: error is 1.384841 (50 iterations in 0.12 seconds)
## Iteration 450: error is 1.376797 (50 iterations in 0.12 seconds)
## Iteration 500: error is 1.373920 (50 iterations in 0.12 seconds)
## Iteration 550: error is 1.369812 (50 iterations in 0.12 seconds)
## Iteration 600: error is 1.369210 (50 iterations in 0.12 seconds)
## Iteration 650: error is 1.366530 (50 iterations in 0.12 seconds)
## Iteration 700: error is 1.364956 (50 iterations in 0.12 seconds)
## Iteration 750: error is 1.365638 (50 iterations in 0.11 seconds)
## Iteration 800: error is 1.366773 (50 iterations in 0.12 seconds)
## Iteration 850: error is 1.367247 (50 iterations in 0.11 seconds)
## Iteration 900: error is 1.367008 (50 iterations in 0.11 seconds)
## Iteration 950: error is 1.367663 (50 iterations in 0.12 seconds)
## Iteration 1000: error is 1.367784 (50 iterations in 0.12 seconds)
## Fitting performed in 2.65 seconds.
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = pbmc, do.label=T, label.size = labSize, do.return=T) 

# ifelse(interactive, tp %>% ggplotly() %>% toWebGL() %>% print(), print(tp))

t-SNE + Metadata Plots

tSNE_metadata_plot <- function(var){
  cat(paste("t-SNE Metadata plot for ", var))
  # Metadata plot 
  p1 <- TSNEPlot(pbmc, do.return = T,  do.label = T,  group.by = var, pt.size=1,
                 plot.title=paste("Color by ",var), vector.friendly=T) + theme(legend.position = "top")
     # layout(legend = list(orientation = 'h', xanchor = "center", x = 0.5, y = .999)) 
     
  # t-SNE clusters plot
  p2 <- TSNEPlot(pbmc, do.return = T, do.label = T, pt.size=1,
                 plot.title=paste("Color by Clusters"), vector.friendly=T)  + theme(legend.position = "top")
  # layout(legend = list(orientation = 'h', xanchor = "center", x = 0.5, y = .999))  
  #print(plot_grid(ggplotly(p1), ggplotly(p2)))
  if(interactive==T){
    fluidPage( 
      fluidRow(
        column(6, p1), column(6, p2) 
      )
    )
  } else{return(plot_grid(p1,p2))}
}   
# metaVars <- c(dx","mut","Gender","Age")
# 
# for (var in metaVars){
#   print(paste("t-SNE Metadata plot for ",var))
#   # Metadata plot 
#   p1 <- TSNEPlot(pbmc, do.return = T, pt.size = 0.5, group.by = var, do.label = T, 
#                  dark.theme=F, plot.title=paste("Color by ",var))
#   # t-SNE clusters plot
#   p2 <- TSNEPlot(pbmc, do.label = T, do.return = T, pt.size = 0.5, plot.title=paste("Color by t-SNE clusters"))
#   print(plot_grid(p1, p2))
# }   

tSNE Disease

tSNE_metadata_plot("dx") 
## t-SNE Metadata plot for  dx

Mutations

tSNE_metadata_plot("mut") 
## t-SNE Metadata plot for  mut

Gender

tSNE_metadata_plot("Gender") 
## t-SNE Metadata plot for  Gender

Age

tSNE_metadata_plot("Age") 
## t-SNE Metadata plot for  Age

### Individual ID

tSNE_metadata_plot("ID")  
## t-SNE Metadata plot for  ID

Cluster Biomarkers

Seurat has several tests for differential expression which can be set with the test.use parameter (see the DE vignette for details). For example, the ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect).

Shown here: Biomarkers of each cluster vs. all other clusters.

Biomarkers Data

All Biomarkers

# Limit to only top variable genes?: 
# Set arg 'only.pos=F' to capture negative biomarkers
pbmc.markers <- FindAllMarkers(object = pbmc, min.pct = 0.25, thresh.use = 0.25,  only.pos = F,  test.use = "wilcox")
pbmc.markers <- pbmc.markers %>% mutate(FC = 2^avg_logFC)
createDT(pbmc.markers, caption = paste("All Biomarkers: All Clusters"))

Top Biomarkers

topNum = 5
topBiomarkers <- pbmc.markers %>% group_by(cluster) %>% top_n(topNum, avg_logFC)
createDT(pbmc.markers, caption = paste("All Biomarkers: All Clusters"))

Cluster Biomarker: Violin Plots

getTopBiomarker <- function(pbmc.markers, clusterID, topN=1){
  df <-pbmc.markers %>%
    subset(p_val_adj<0.05 & cluster==as.character(clusterID)) %>%
    arrange(desc(avg_logFC))
    top_pct_markers <- df[1:topN,"gene"]
  return(top_pct_markers)
}
# clust1_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=1, topN=2)
# clust2_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=2, topN=2)


### Plot biomarkers 
plotBiomarkers <- function(pbmc, biomarkers, cluster){
  biomarkerPlots <- list()
  for (marker in biomarkers){ 
    p <- VlnPlot(object = pbmc, features.plot = c(marker), y.log=T, return.plotlist=T) 
    biomarkerPlots[[marker]] <- p + ggplot2::aes(alpha=0.5) + xlab( "Cluster") + ylab( "Expression")
  }
  combinedPlot <- do.call(grid.arrange, c(biomarkerPlots, list(ncol=2, top=paste("Top DEG Biomarkers for Cluster",cluster))) ) 

  # biomarkerPlots <- lapply(biomarkers, function(marker) {
  #   VlnPlot(object = pbmc, features.plot = c(marker), y.log=T, return.plotlist=T) %>% + ggplot2::ggtitle(marker) %>% ggplotly() 
  # })    
  # return(subplot(biomarkerPlots) )
}   

top1 <- pbmc.markers %>% group_by(cluster) %>% top_n(1, avg_logFC) 
nCols <- floor( sqrt(length(unique(top1$cluster))) )   
figHeight <- nCols *7

# Plot top 2 biomarker genes for each 
for (clust in unique(pbmc.markers$cluster)){ 
   cat('\n')   
   cat("### Cluster ",clust,"\n") 
   biomarkers <- getTopBiomarker(pbmc.markers, clusterID=clust, topN=2)
   plotBiomarkers(pbmc, biomarkers, clust)  
   cat('\n')   
} 

Cluster 0

Cluster 1

Cluster 2

Cluster Biomarker: Volcano Plots

##Construct the plot object
volcanoPlot <- function(DEG_df, caption="", topFC_labeled=5){
  DEG_df$sig<-  ifelse( DEG_df$p_val_adj<0.05 & DEG_df$avg_logFC<1.5, "p_val_adj<0.05",
            ifelse( DEG_df$p_val_adj<0.05  & DEG_df$avg_logFC>1.5, "p_val_adj<0.05 & avg_logFC>1.5",
                "p_val_adj>0.05"
        )) 
  DEG_df <- arrange(DEG_df, desc(sig))
  
  vol <- ggplot(data=DEG_df, aes(x=avg_logFC, y= -log10(p_val_adj))) +
    geom_point(alpha=0.5, size=3, aes(col=sig)) + 
    scale_color_manual(values=list("p_val_adj<0.05"="turquoise3",
                                   "p_val_adj<0.05 & avg_logFC>1.5"="purple", 
                                   "p_val_adj>0.05" = "darkgray")) +
    theme(legend.position = "none") + 
    xlab(expression(paste("Average ",log^{2},"(fold change)"))) +
    ylab(expression(paste(-log^{10},"(p-value)"))) + xlim(-2,2) + 
    ## ggrepl labels
    geom_text_repel(data= arrange(DEG_df,  p_val_adj, desc(avg_logFC))[1:topFC_labeled,], 
                    # filter(DEG_df, avg_logFC>=1.5)[1:10,],
                    aes(label=gene),  color="black", alpha=.5,
                    segment.color="black", segment.alpha=.5  
                    ) +  
    # Lines
    geom_vline(xintercept= -1.5,lty=4, lwd=.3, alpha=.5) + 
    geom_vline(xintercept= 1.5,lty=4, lwd=.3, alpha=.5) +
    geom_hline(yintercept= -log10(0.05),lty=4, lwd=.3, alpha=.5) + 
    ggtitle(caption) 
  cat(vol)
}

for (clust in unique(pbmc.markers$cluster)){
   cat('\n')   
   cat("### Cluster ",clust,": Volcano") 
   cap <- paste("Cluster",clust,"DEG Table") 
   DEG_df <- subset(pbmc.markers, cluster==as.character(clust)) %>% arrange(desc(avg_logFC))  
   volcanoPlot(DEG_df, caption = cap)
   createDT(DEG_df, caption = cap)
   cat('\n')   
}
## 
## ### Cluster  0 : Volcano
## Error in cat(vol): argument 1 (type 'list') cannot be handled by 'cat'

Top Biomarker Plot

Biomarkers tSNE

fp <- FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("grey", "purple"), 
    reduction.use = "tsne", nCol = nCols, do.return = T)

Biomarkers Heatmap

top5 <- pbmc.markers %>% group_by(cluster) %>% top_n(5, avg_logFC)
# setting slim.col.label to TRUE will print just the cluster IDS instead of
# every cell name
DoHeatmap(object = pbmc, genes.use = top5$gene, slim.col.label=T, remove.key=T) 

# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p))

Biomarkers Ridgeplot

RidgePlot(pbmc, features.plot = top1$gene,  nCol = nCols, do.sort = F)
## Picking joint bandwidth of 0.291
## Picking joint bandwidth of 0.13
## Picking joint bandwidth of 0.0842

Biomarkers Split Dot Plot

Visualize biomarker expression for each cluster, by disease

top2 <- pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)

sdp <- SplitDotPlotGG(pbmc, genes.plot = top2$gene, cols.use = c("blue","red"), 
                      x.lab.rot = T, plot.legend = T, dot.scale = 8, do.return = T, grouping.var = "dx")

Map Clusters to Known Biomarkers

  • Known Monocytes Biomarkers
  • Classical: CD14++ / CD16–
  • Intermediate: CD14++ / CD16+
  • Nonclassical: CD14+ / CD16++ (not captured in this data)

The following plots show the absolute expression of each biomarker, as opposed to avg_logFC which is dependent on the expression patterns of other cell types being compared.

Markers Dataframe

markerList <- c("CD14", "FCGR3A") 
 
get_markerDF <- function(pbmc, markerList, meta_vars =c("barcode", "dx", "mut","post_clustering", "percent.mito","nGene", "nUMI")){
  exp <- pbmc@scale.data %>% data.frame() 
  marker.matrix <- exp[row.names(exp) %in% markerList, ] 
  marker.matrix$Gene <- row.names(marker.matrix)
  markerMelt <- reshape2:::melt.data.frame(marker.matrix, id.vars = "Gene", variable.name = "Cell",value.name = "Expression") 
  metaSelect <-  pbmc@meta.data[,meta_vars] 
  markerDF <- merge(markerMelt,metaSelect, by.x="Cell", by.y="barcode") 
  return(markerDF)
}
markerDF <- get_markerDF(pbmc, markerList)
createDT(markerDF, caption = "Known Marker Expression")

Marker ANOVAs + Boxplots

# Explore expression differences between groups
marker_vs_metadata <- function(markerDF, meta_var){ 
  # Create title from ANOVA summary
  ANOVAtitle <- function(markerDF, marker){
      nTests <- length(unique(markerDF$Gene))
      res <- anova(lm(data = subset(markerDF, Gene==marker), 
                      formula = Expression ~ eval(parse(text=meta_var))))
      
      title <-paste(paste("ANOVA (",marker, " vs. ",meta_var, ")", sep=""), 
                    ": p=",round(res$`Pr(>F)`,3), 
                    ", F=",round(res$`F value`,3), 
        ifelse(res$`Pr(>F)`<.05/nTests,"(Significant**)",
               "(Non-significant)") ) 
  }
  
  title = ""
  for (marker in unique(markerDF$Gene) ){
    cat(marker)
    title <- paste(title, "\n", ANOVAtitle(markerDF, marker))
  } 
  
  ggplot(markerDF, aes(x=eval(parse(text=meta_var)), y=Expression, fill= Gene)) + 
    geom_boxplot() +  
    labs(title = title, x=meta_var) +
    theme(plot.title = element_text( size=10)) +
    scale_fill_manual(values=c("brown", "slategray"))
}

ANOVA: dx

marker_vs_metadata(markerDF, "dx")
## CD14FCGR3A

ANOVA: mut

marker_vs_metadata(markerDF, "mut") 
## CD14FCGR3A

Defining Cell-types

Identify Cell Types By DEGs

  • Classification Approach 1
  • Go through each cluster and see what biomarkers they have through DGE.
  • If a cluster has a known gene marker within their top N biomarkers (with the right valence +/-), classify all cells within that cluster as a given cell type.
  • Drawback: The biomarkers that DGE identifies for each cluster will depend on the composition of the other cells. So some clusters may remain unidentified simply by the fact that they didn’t have the proper cells to be compared to.
identify_cellTypes_by_biomarkers <- function(pbmc.markers, topN_search=5){
  top <- pbmc.markers %>% group_by(cluster) %>% top_n(topN_search, avg_logFC) 
  clust_cellTypes <- list()
  for (clust in top$cluster){ 
    clustSub <- top[top5$cluster==clust, ]
    CD16_logFC <- subset(clustSub, gene=="CFD")$avg_logFC 
    
    cellType <- ifelse(sum(markerList %in% clustSub$gene), # Both CD14 and CD16? Great, keep going
           ifelse(CD16_logFC == abs(CD16_logFC), "CD14++/CD16+",  # But does CD16 have pos logfC? If so, then it's "CD14++/CD16+"
                  "CD14++/CD16--"), # Otherwise, it means it means CD16 logFC is neg, meaning "CD14++/CD16--"
           NA) # If it's none of these, it's an undefined cell type 
     clust_cellTypes[clust] <- as.factor(cellType)
  }
  newMeta <- pbmc@meta.data
  newMeta["CellType_DGE"] <- plyr::mapvalues(newMeta$post_clustering, names(clust_cellTypes), as.character(clust_cellTypes) )  
  pbmc <- AddMetaData(pbmc, metadata = newMeta)
  return(pbmc)
} 
pbmc <- identify_cellTypes_by_biomarkers(pbmc.markers, 5)

# (Doesn't make sense to do bar plot because whole clusters are defined by their biomarkers)

tSNE_metadata_plot("CellType_DGE")
## t-SNE Metadata plot for  CellType_DGE

Identify Cell Types By Average Expression

  • Classification Approach 2
  • Alternatively, you could classify cells by their absolute expression values. E.g. If any cell expresses CD16 more than the average expression of CD16 across all cells, then classify it as CD16+.
  • Drawback: The threshold for classifying a cell as either CD16+ or CD16– is defined rather arbitrarily (the average expression of CD16 in this sample isn’t necessarily biologically meaningful).
# A simplistic way of categorizing cells into CD14++/CD16+ and CD14++/CD16--, 
## is by splitting cells into groups based on whether their expression is 
## higher or lower than the average CD16 expression of all cells.
identify_cellTypes_by_avgExpression <- function(pbmc, markerDF){
  avgMarkerExp <-markerDF %>% group_by(Gene) %>% dplyr::summarise(meanExp = mean(Expression))
  avgMarkerExp <- setNames(avgMarkerExp$meanExp, avgMarkerExp$Gene)
  CD16 <- markerDF[markerDF$Gene=="FCGR3A",]
  CD16_group <- ifelse(CD16$Expression >= avgMarkerExp["FCGR3A"], "CD14++/CD16+", "CD14++/CD16--")
  CD16["CellType_AvgExp"]  <- as.factor(CD16_group)
  # Make sure row order is same before putting back into meta.data
  metaD <- pbmc@meta.data
  newMeta <- merge(metaD, CD16[,c("Cell","CellType_AvgExp")], by.x="barcode", by.y="Cell")
  row.names(newMeta) <- row.names(metaD)
  pbmc <- AddMetaData(pbmc, metadata = newMeta)
  return(pbmc)
}
pbmc <- identify_cellTypes_by_avgExpression(pbmc, markerDF)
 
# Get proportions of cell types in each cluster
cluster_proportions <- pbmc@meta.data %>% group_by(CellType_AvgExp, post_clustering) %>% 
  tally() %>%
  group_by(post_clustering, CellType_AvgExp) %>%
  mutate(percentTotal = n/sum(n)*100)

ggplot(cluster_proportions, aes(x=post_clustering, y=percentTotal, fill=CellType_AvgExp)) + geom_col(position = "fill") +  
  labs(title="Proportions of Cell-types per Cluster: \n CellType_AvgExp", 
       x="Cluster", y="Cell Type / Total Cells") +
  scale_fill_manual(values=c("brown", "slategray"))

tSNE_metadata_plot("CellType_AvgExp")
## t-SNE Metadata plot for  CellType_AvgExp

Known Biomarkers: Heatmaps

Average Expression: By Clusters

markerDF <- markerDF %>% mutate(Cluster = post_clustering)
# Show mean exp for each marker
avgMarker <- markerDF %>% group_by(Gene, Cluster) %>% summarise(meanExp = mean(Expression)) 

p <- ggplot(data = avgMarker, aes(x=Gene, y=Cluster, fill=meanExp)) %>% + geom_tile() + scale_fill_viridis()
p

# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p))

Average Expression: By Disease

# Show mean exp for each marker
avgMarker <- markerDF %>% group_by(Gene, dx, Cluster) %>% summarise(meanExp = mean(Expression)) 
p <- ggplot(data = avgMarker, aes(x=Gene, y=dx, fill=meanExp)) %>% + geom_tile() + scale_fill_viridis()
p

# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p))

Cells Separated

markerDF <- get_markerDF(pbmc, markerList, 
             meta_vars =c("barcode", "dx", "mut","ID","post_clustering", "percent.mito","nGene", "nUMI",
                          "CellType_DGE","CellType_AvgExp"))
Spectral <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(length(unique(pbmc@meta.data$mut)), "Spectral"))

if (interactive==T){
  # Spectral <- heatmaply::Spectral(length(unique(pbmc@meta.data$mut)))
  markerMelt <- reshape2::acast(markerDF, Cell~Gene, value.var="Expression", fun.aggregate = mean, drop = F, fill = 0) 
  heatmaply::heatmaply(markerMelt,  key.title="Expression",#plot_method= "ggplot",
        k_row = dim(markerMelt)[2], dendrogram = "row",
        showticklabels = c(T, F), xlab = "Known Markers", ylab = "Cells", column_text_angle = 45, 
        row_side_colors =  pbmc@meta.data[,c("dx","mut", "CellType_DGE")], row_side_palette = Spectral
        )  %>%  colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 2)  %>% 
        colorbar(tickfont = list(size = 12), titlefont = list(size = 14), which = 1)
}else{ 
  # markerDF_sub <-subset(markerDF, Gene==markerList[1])  
  # var_to_colors(markerDF_sub, "post_clustering")  
  # library(pheatmap)
  # pheatmap(markerMelt, annotation_row = markerDF_sub[c("dx","mut","CellType_DGE", "CellType_AvgExp")])
  # pheatmap(markerMelt, kmeans_k = NA, annotation_row = markerDF_sub[c("dx","mut","CellType_DGE", "CellType_AvgExp")],
  #         cluster_cols = F, cutree_rows = length(unique(markerDF$post_clustering)),  angle_col=45 )
  library(RColorBrewer) 
  var_to_colors <- function(markerDF, metaVar){
    colors <- brewer.pal(length(unique(markerDF[metaVar]) ), "Dark2")
     sample(colors, length(unique(markerDF[metaVar])), replace = TRUE, prob = NULL)
    # metaColors <- colors[ subset(markerDF, Gene==markerList[1])[metaVar][,1] %>% as.factor() ]
    return(metaColors)
  }  
  # library(GMD)
  # myCols = cbind(var_to_colors(markerDF, "dx"), var_to_colors(markerDF, "mut")) 
  # rlab=t(cbind(
  #   var_to_colors(markerDF, "post_clustering"),
  #   var_to_colors(markerDF, "dx")
  #   ))
  #   heatmap.2(marker.matrix, key.title="Expression",  col = viridis(300), trace="none",Colv = F, Rowv = F,
  #             labRow = F, xlab = "Biomarker", ylab="Cell", cexCol=1, RowSideColors = var_to_colors(markerDF, "post_clustering")
  #             )
  # heatmap.3(markerMelt, dendrogram = 'row', kr = length(unique(markerDF)), labRow = F, 
  #           xlab = "Biomarker", ylab = "Cell", RowSideColors = rlab, RowSideColorsSize=2 )
   
  
  
  markerDF <- markerDF %>%    
    mutate_at(vars(post_clustering, dx, mut, ID, CellType_DGE, CellType_AvgExp), as.factor) %>% 
    mutate(Cluster = post_clustering) %>%
    arrange(post_clustering) 
  # markerMelt <- reshape2::acast(markerDF, Cell~Gene, value.var="Expression", fun.aggregate = mean, drop = F, fill = 0) 
  markerMelt <- dcast(markerDF,  Cell + post_clustering + dx + mut + ID + CellType_DGE + CellType_AvgExp ~ Gene,
                      fun.aggregate = mean, value.var = "Expression") %>% arrange(post_clustering)
  marker.matrix <- markerMelt[markerList] %>%as.matrix()
  row.names(marker.matrix) <- markerMelt$Cell
  
  ha = HeatmapAnnotation(df = markerDF[c("dx","mut","ID","CellType_DGE","CellType_AvgExp")], which = "row") 
  
  ComplexHeatmap::Heatmap(marker.matrix, col=viridis(300), column_title = "Biomarker", row_title = "Cell",  
                          row_dend_reorder = F,show_row_names = F, show_column_dend = F,show_row_dend =T,
                          cluster_rows = T, column_title_side = "bottom",km = length(unique(markerMelt$post_clustering))) + ha
 

} 

Known Biomarkers: Boxplot

ggplot(data = markerDF, aes(x=Cluster, y=Expression, fill=Gene)) %>% 
  + geom_boxplot(alpha=0.5) %>% + scale_fill_manual(values=c("purple", "turquoise"))  

Known Biomarkers: tSNE

#, results = 'hide', fig.show='hide'
expressionTSNE <- function(pbmc, marker, colors=c("grey", "red")){
   FeaturePlot(object = pbmc, features.plot = marker, cols.use = colors, 
    reduction.use = "tsne", nCol=2, do.return = T, dark.theme = T)[[1]]
  # p <- ifelse(interactive, p %>% ggplotly() %>% toWebGL(), print(p)) 
}
 plot_grid(expressionTSNE(pbmc, markerList[1]),
 expressionTSNE(pbmc, markerList[2], colors=c("grey", "green")))

Label Clusters by DGE Biomarkers

current.cluster.ids <- unique(pbmc.markers$cluster) #c(0, 1, 2, 3, 4, 5, 6, 7)
top1 <- pbmc.markers %>% group_by(cluster) %>% top_n(1, avg_logFC)
new.cluster.ids <- top1$gene #c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")

pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
TSNEPlot(object=pbmc, do.label=T, pt.size=0.5, do.return=T) 

# ifelse(interactive, p %>% ggplotly() %>% toWebGL() %>% print(), print(p)) 

DGE: All Cells

  • DGE methods available in Seurat include:
  • DESeq2DETest()
  • DiffExpTest()
  • DiffTTest()
# Available DGE methods:
## "wilcox", "bimod", "roc", "t", "tobit", "poisson", "negbinom", "MAST", "DESeq2"
runDGE <- function(pbmc, meta_var, group1, group2, test.use="wilcox"){
  #print(paste("DGE_allCells",meta_var,sep="_")) 
  pbmc <- SetAllIdent(pbmc, id = meta_var)
  pbmc <- StashIdent(pbmc, save.name = meta_var)  
  DEGs <- FindMarkers(pbmc, ident.1=group1, ident.2=group2, test.use=test.use)
  DEGs$gene <- row.names(DEGs)
  return(DEGs)
}

PD vs. Controls

DEG_df <-runDGE(pbmc, "dx", group1 = "PD", group2="control")
cap = paste("DEGs (All Cells): PD vs. Controls")

createDT(DEG_df, caption = cap)
volcanoPlot(DEG_df, caption = cap)
## Error in cat(vol): argument 1 (type 'list') cannot be handled by 'cat'

LRRK vs. PD

DEG_df <-runDGE(pbmc, "mut", "LRRK2", "PD")
cap <- paste("DEGs (All Cells): LRRK2 vs. PD")

createDT(DEG_df, caption = cap)
volcanoPlot(DEG_df, caption = cap)
## Error in cat(vol): argument 1 (type 'list') cannot be handled by 'cat'

CD14++/CD16+ vs. CD14++/CD16–

DEG_df <-runDGE(pbmc, "CellType_DGE", "CD14++/CD16+", "CD14++/CD16--")
## Error in WhichCells(object = object, ident = ident.1): Identity : CD14++/CD16+ not found.
cap <- paste("DEGs (All Cells): CD14++/CD16+ vs. CD14++/CD16--")

createDT(DEG_df, caption = cap)
volcanoPlot(DEG_df, caption = cap)
## Error in cat(vol): argument 1 (type 'list') cannot be handled by 'cat'

DGE: Within Clusters

DGE_within_clusters <- function(pbmc, meta_var, group1, group2){
  for (clust in unique(pbmc@ident)){
    # Subset cells by cluster
   pbmc <- SetAllIdent(pbmc, id = "post_clustering")
   pbmc_clustSub <- SubsetData(pbmc, ident.use = clust, subset.raw = T) 
   cap <- paste("Cluster ",clust,": \n ",group1," vs. ", group2, sep="") 
   cat('\n')   
   cat("### ",cap)
   # DGE
   DEG_df <-runDGE(pbmc_clustSub, meta_var, group1 , group2 ) 
   # Show results
   volcanoPlot(DEG_df, caption = cap)
   createDT(DEG_df, caption = cap)
   cat('\n')   
  } 
}

Between Disease Groups

DGE_within_clusters(pbmc, "dx", "PD", "control") 
## Error in WhichCells(object = object, ident = ident.use, ident.remove = ident.remove, : Identity : S100A12 not found.

Between Mutation Groups

DGE_within_clusters(pbmc, "mut", "LRKK2", "PD")
## Error in WhichCells(object = object, ident = ident.use, ident.remove = ident.remove, : Identity : S100A12 not found.

Between Mutation Groups

DGE_within_clusters(pbmc, "CellType_DGE", "CD14++/CD16+", "CD14++/CD16--") 
## Error in WhichCells(object = object, ident = ident.use, ident.remove = ident.remove, : Identity : S100A12 not found.
DGE_within_clusters(pbmc, "CellType_AvgExp", "CD14++/CD16+", "CD14++/CD16--")
## Error in WhichCells(object = object, ident = ident.use, ident.remove = ident.remove, : Identity : S100A12 not found.

Try Different Cluster Resolutions

If you perturb some of our parameter choices above (for example, setting resolution=0.8 or changing the number of PCs), you might see the CD4 T cells subdivide into two groups. You can explore this subdivision to find markers separating the two T cell subsets. However, before reclustering (which will overwrite object@ident), we can stash our renamed identities to be easily recovered later.

Find New Clusters

new_resolution <- 3.0
orig_resolution <- paste("resolution",params$resolution,sep="_")
pbmc <- StashIdent(object = pbmc, save.name = orig_resolution) 

## Warning in BuildSNN(object = object, genes.use = genes.use, reduction.type
## = reduction.type, : Build parameters exactly match those of already
## computed and stored SNN. To force recalculation, set force.recalc to TRUE.
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
                     resolution = new_resolution, print.output = F)
## 4 singletons identified. 33 final clusters.
pbmc <- StashIdent(object = pbmc, save.name = "resolution_3.0") 

plot1 <- TSNEPlot(object = pbmc, do.return = TRUE, no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot2 <- TSNEPlot(object = pbmc, do.return = TRUE, group.by = "ClusterNames_0.6", 
                  no.legend = TRUE, do.label = TRUE, label.size=labSize)
## Error in FetchData(object = object, vars.all = group.by): Error: ClusterNames_0.6 not found
plot_grid(plot1, plot2)
## Error in plot_grid(plot1, plot2): object 'plot2' not found

Find New Biomarkers

res3.0_markers <- FindAllMarkers(object = pbmc, min.pct = 0.25, thresh.use = 0.25,  only.pos = F,  test.use = "wilcox")
FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("green", "blue"))

# Set back to orig
pbmc <- SetAllIdent(object = pbmc, id = orig_resolution) 

Save Results

# Save results for EACH run (in their respective subfolders)
saveRDS(pbmc, file=file.path(params$resultsPath, "cd14-processed.rds") )